The purpose of this workflow is to look at the genes that have differential states across strains at the TSS.
Here we look at which states are represented at the TSS, and which genes have variation there.
We look for genes for which all strains have a single state at the TSS, and genes for which two states are represented across the strains.
We count the number of genes with each state combination at the TSS. We calculate functional enrichments for each group of genes. And we look at expression patterns for each group of genes.
library(here);library(gprofiler2);library(pheatmap);library(fgsea)
num.states = 9
is.interactive = FALSE
#is.interactive = TRUE
form.states <- paste0(num.states, "_states_C")
data.dir <- here("Data", "ChromHMM", form.states)
results.dir <- here("Results", "ChromHMM", form.states)
all.code.dir <- list.files(here("Code"), full.names = TRUE)
for(i in 1:length(all.code.dir)){
all.fun <- list.files(all.code.dir[i], full.names = TRUE)
for(j in 1:length(all.fun)){source(all.fun[j])}
}
transcript.info <- readRDS(here("Data", "RNASeq", "RNASeq_gene_info.RData"))
chrom.mats <- readRDS(file.path(results.dir, "Chromatin.States.Gene.Coords.RDS"))
#chrom.mats <- readRDS(file.path(results.dir, "Chromatin_States_9_full_gene_1000.RData"))
transformed.data.file <- here("Data", "RNASeq", "StrainsEffCts9_vst.RDS")
#expr <- readRDS(transformed.data.file)
expr <- readRDS(here("Data","RNASeq", "Strain_Mean_C_Expression.RData"))
transcript.info.file <- here("Data", "RNASeq", "RNASeq_gene_info.RData")
transcript.info <- readRDS(transcript.info.file)
col.table <- as.matrix(read.table(here("Data", "support_files", "strain.color.table.txt"),
sep = "\t", comment.char = "%", stringsAsFactors = FALSE))
Find the states at the TSS of each transcript
get_tss_states <- function(chrom.mat){
if(length(chrom.mat) == 1){return(rep(NA, 9))}
tss.idx <- get.nearest.pt(as.numeric(rownames(chrom.mat)), 0)
return(chrom.mat[tss.idx,,drop=FALSE])
}
tss.states <- t(sapply(chrom.mats, get_tss_states))
colnames(tss.states) <- colnames(chrom.mats[[1]])
#tally which states are present there
state.counts <- t(apply(tss.states, 1,
function(y) sapply(1:num.states, function(x) length(which(y == x)))))
colnames(state.counts) <- 1:num.states
#find all the genes with variation
var.genes <- which(apply(state.counts, 1, function(x) length(which(x != 0)) > 1))
#this code looks at functional enrichment for genes marked with each
#state across the strains
state.enrich.file <- file.path(results.dir, "Enrichment.by.State.and.Strain.RDS")
if(!file.exists(state.enrich.file)){
st.enrich <- vector(mode = "list", length = ncol(tss.states))
names(st.enrich) <- 1:num.states
for(st in 1:num.states){
if(is.interactive){print(st)}
st.genes <- apply(tss.states, 2, function(x) which(x == st))
st.enrich[[st]] <- lapply(st.genes, function(x) gost(names(x), organism = "mmusculus",
sources = c("GO", "REACTOME", "KEGG")))
}
saveRDS(st.enrich, state.enrich.file)
}else{
st.enrich <- readRDS(state.enrich.file)
for(st in 1:num.states){
cat("### State", st, "\n")
plot.enrichment.group(st.enrich[[st]], max.term.size = 2000,
sort.by = "p_value")
cat("\n\n")
}
}
There are 4210 out of 14048 total genes with variation at the TSS. The figure below shows genes in rows and states in columns. Each cell contains the number of strains that had each state at the TSS for that gene. State 7 is the most abundant state at the TSS even for genes with variation at this position. State 1 is the next most abundant.
pheatmap(state.counts, show_rownames = FALSE)
The figure below shows which states tend to co-occur at the TSS. Most of the states are negatively correlated with each other. State 7 tends to co-occur with state 8, but is very negatively correlated with state 1.
States 1, 2, and 3, all negative states, tend to co-occur, and states 4 and 5 tend to co-occur.
cor.mat <- state.counts
cor.mat[which(state.counts > 0)] <- 1
state_pairs <- pair.matrix(1:num.states, self.pairs = TRUE)
pair.names <- apply(state_pairs, 1, function(x) paste(x, collapse = "_"))
all.pair.co.occur <- vector(mode = "list", length = nrow(state_pairs))
names(all.pair.co.occur) <- pair.names
pair.cor <- matrix(NA, nrow = num.states, ncol = num.states)
rownames(pair.cor) <- colnames(pair.cor) <- 1:num.states
paired.tss.expr <- vector(mode = "list", length = nrow(state_pairs))
names(paired.tss.expr) <- pair.names
for(i in 1:nrow(state_pairs)){
state1 <- state_pairs[i,1]
state2 <- state_pairs[i,2]
if(state1 == state2){
state1.locale <- state2.locale <- which(state.counts[,state1] == ncol(state.counts))
}else{
state1.locale <- which(cor.mat[,state1] > 0)
state2.locale <- which(cor.mat[,state2] > 0)
}
both.together <- intersect(state1.locale, state2.locale)
group.genes <- rownames(cor.mat)[both.together]
all.pair.co.occur[[i]] <- group.genes
#get expression for these genes
expr.locale <- match(group.genes, names(expr))
has.expr <- which(sapply(expr[expr.locale], length) == 9)
group.expr <- Reduce("rbind", expr[expr.locale[has.expr]])
rownames(group.expr) <- group.genes[has.expr]
#divide expression based on TSS state
group.tss <- tss.states[c(match(rownames(group.expr), rownames(tss.states))),]
#plot(as.vector(group.expr)~as.factor(as.vector(group.tss)), xlab = "State", ylab = "Expression", main = paste("State", state1, "and State", state2))
paired.tss.expr[[i]] <- list("TSS_states" = group.tss, "Group_Expr" = group.expr)
#calculate correlation of two states
state1.freq <- length(state1.locale)/nrow(cor.mat)
state2.freq <- length(state2.locale)/nrow(cor.mat)
if(state1 != state2){
both.freq <- length(both.together)/nrow(cor.mat)
cof.rate <- both.freq - (state1.freq*state2.freq)
pair.cor[state1, state2] <- cof.rate
pair.cor[state2, state1] <- cof.rate
}else{
pair.cor[state1, state2] <- state1.freq
}
}
if(is.interactive){quartz()}
#with individual state frequencies
imageWithText(signif(pair.cor,2), split.at.vals = TRUE,
col.scale = c("blue", "brown"), grad.dir = "ends")
## Loading required package: grid
#Without individual frequencies
diag(pair.cor) <- NA
imageWithText(signif(pair.cor,2), split.at.vals = TRUE,
col.scale = c("blue", "brown"), grad.dir = "ends", col.text.rotation = 0)
state.pair.count <- sapply(all.pair.co.occur, length)
pair.count.order <- order(state.pair.count)
if(is.interactive){quartz()}
barplot(state.pair.count[pair.count.order], las = 2, horiz = TRUE, cex.names = 1)
We looked at the effect of the chromatin state right at the TSS and gene expression. This is a littl different than the other effects of gene expression we’ve looked at. Here we first identified the genes for which different strains had either of two states at the TSS. We then divided the expression for the strains with the first state at the TSS, and the strains with the second state at the TSS. The boxplots show overall differences in expression for the strains with each given state at the TSS.
The figure below shows the expression of all genes across all strains based on the state present at the TSS. Genes with state 7 at the TSS have the highest expression overall.
matched.expr <- expr[match(rownames(tss.states), names(expr))]
has.expr <- which(sapply(matched.expr, length) == 9)
sub.expr <- Reduce("rbind", matched.expr[has.expr])
rownames(sub.expr) <- names(matched.expr)[has.expr]
sub.tss <- tss.states[match(rownames(sub.expr), rownames(tss.states)),]
expr.vector <- unlist(sub.expr)
tss.vector <- unlist(sub.tss)
expr.by.tss <- lapply(1:9, function(x) expr.vector[which(tss.vector == x)])
boxplot(expr.by.tss)
for(i in 1:length(paired.tss.expr)){
if(is.interactive){quartz()}
cat("###", names(paired.tss.expr[i]), "\n")
box.col <- rep("gray", num.states)
box.col[state_pairs[i,]] <- "red"
plot(as.vector(paired.tss.expr[[i]][[2]])~as.factor(as.vector(paired.tss.expr[[i]][[1]])),
main = paste("State", unique(state_pairs[i,])), xlab = "State", ylab = "Expression",
col = box.col)
cat("\n\n")
}
cat("### All Single States\n")
single.locale <- which(apply(state_pairs, 1, function(x) length(unique(x))) == 1)
boxplot(lapply(paired.tss.expr[single.locale], function(x) x[[2]]))
abline(h = median(unlist(paired.tss.expr[single.locale])))
cat("\n\n")
We looked at functional enrichments for genes with each pair of states present at the TSS.
Some of these genes have more than two states at the TSS, so the groups are overlapping somewhat, but using more categories was a bit too disorganized.
Creyghton et al. 2010 found that although H3K4me1 marks cell type-specific enhancers, it marks all enhancers whether they are poised or active. They showed that the presence of H3K27ac distinguishes active enhancers (H3K27ac present) from poised (H3K27ac absent). They suggested that the pattern of these enhancer states could give us clues as to the developmental potential of a cell, or its potential to respond to stimuli.
Can we use this information to look at enhancer patterns across the strains to see which strains could potentially respond to, or are responding to a stimulus, with inflammation, for example?
Let’s think about a case study gene: Irf5. This gene is associated with cytokine response. CAST and PWK have marks of a poised enhancer at the TSS, while the other strains have marks associated with active enhancers. Does this mean that Irf5 has the potential to be expressed in CAST and PWK with the right stimulus, but currently is not, whereas the other strains are all expressing that gene without any stimulus?
What are the enrichments of genes for which some strains have poised enhancers (3, 4, 8), and others have active enhancers (5,6,7) at the TSS?
enrich.file <- here("Results", "chQTL", "State_Pair_Enrichment.RDS")
if(!file.exists(enrich.file)){
state_enrich <- vector(mode = "list", length = length(all.pair.co.occur))
names(state_enrich) <- names(all.pair.co.occur)
for(i in 1:length(all.pair.co.occur)){
enrichment <- gost(all.pair.co.occur[[i]], organism = "mmusculus", sources = "GO")
if(!is.null(enrichment)){
state_enrich[[i]] <- enrichment
}
}
saveRDS(state_enrich, enrich.file)
}else{
state_enrich <- readRDS(enrich.file)
}
for(i in 1:length(state_enrich)){
if(length(state_enrich[[i]]) > 0){
if(is.interactive){quartz()}
cat("###", names(state_enrich[i]), "\n")
#plot.enrichment(state_enrich[[i]], 30, plot.label = paste(names(state_enrich)[i], length(all.pair.co.occur[[i]]), "genes"))
par(mfrow = c(1,2))
plot.enrichment.wordcloud(enrichment = state_enrich[[i]], 10,
max.term.size = 2000)
mtext(paste0("State ", names(all.pair.co.occur)[i], ": ",
length(all.pair.co.occur[[i]]), " genes"), side = 3, outer = TRUE,
line = -2.5)
cat("\n\n")
}
}
state_gene_names <- lapply(all.pair.co.occur,
function(x) transcript.info[match(x,
transcript.info[,"ensembl_gene_id"]),"external_gene_name"])
#pdf("~/Desktop/group_enrich.pdf", width = 9, height = 12)
test <- plot.enrichment.group(state_enrich, transformation = "sqrt")
#dev.off()
pdf(file.path(results.dir, "TSS_Enrichment_Projections.pdf"), height = 15, width = 15)
plot_bipartite(test)
dev.off()
## quartz_off_screen
## 2
state_gene_names$"2_5"
states <- "3_7"
states <- "7_7"
if(is.interactive){quartz()}
pair.locale <- which(names(state_enrich) == states)
plot.enrichment(state_enrich[[pair.locale]], 30,
plot.label = paste(names(state_enrich)[pair.locale],
length(all.pair.co.occur[[pair.locale]]), "genes"),
max.term.size = 2000)
par(mfrow = c(1,2))
plot.enrichment.wordcloud(enrichment = state_enrich[[i]], 10)
gene.name = "Rdh16f1"
id <- transcript.info[which(transcript.info[,"external_gene_name"] == gene.name)[1],"ensembl_gene_id"]
id.locale <- which(names(chrom.mats) == id)
get_tss_states(chrom.mats[[id.locale]])
Use the shiny chromatin viewer to see examples.
Most of the states (1, 2, 3, 7, 8, and 9) have their strongest effects at the TSS, which is easy to isolate. However, states, 4, 5, and 6 have their primary effects in the intragenic region. It would be interesting to look at genes that specifically have these states intragenically, but I’m not sure how to do that. I am going to test some code here.
get_intragenic_state_count <- function(chrom.mat, state){
if(length(chrom.mat) == 1){return(rep(NA, 9))}
intragenic.idx <- intersect(which(as.numeric(rownames(chrom.mat)) > 0), which(as.numeric(rownames(chrom.mat)) < 1))
intra.chrom <- chrom.mat[intragenic.idx,,drop=FALSE]
state.idx <- lapply(1:ncol(intra.chrom), function(x) which(intra.chrom[,x] == state))
state.count <- sapply(state.idx, length)/nrow(intra.chrom)
return(state.count)
}
min.prop <- 0.1
for(state in 4:6){
cat(paste("### State", state, "Intragenic Enrichment\n"))
intragenic.prop <- t(sapply(chrom.mats, function(x) get_intragenic_state_count(x, state)))
colnames(intragenic.prop) <- colnames(chrom.mats[[1]])
no.state <- lapply(1:ncol(intragenic.prop), function(x) which(intragenic.prop[,x] == 0))
names(no.state) <- colnames(intragenic.prop)
some.state <- lapply(1:ncol(intragenic.prop), function(x) which(intragenic.prop[,x] > min.prop))
names(some.state) <- colnames(intragenic.prop)
state.no.enrich.file <- file.path(results.dir, paste0("State_", state, "_intragenic_enrichment_none.RDS"))
if(!file.exists(state.no.enrich.file)){
no.enrich <- lapply(no.state,
function(x) gost(rownames(intragenic.prop)[x], organism = "mmusculus",
sources = c("GO", "KEGG", "REACTOME")))
saveRDS(no.enrich, state.no.enrich.file)
}else{
no.enrich <- readRDS(state.no.enrich.file)
}
state.some.enrich.file <- file.path(results.dir, paste0("State_", state, "_intragenic_enrichment_some.RDS"))
if(!file.exists(state.some.enrich.file)){
some.enrich <- lapply(some.state,
function(x) gost(rownames(intragenic.prop)[x], organism = "mmusculus",
sources = c("GO", "KEGG", "REACTOME")))
saveRDS(some.enrich, state.some.enrich.file)
}else{
some.enrich <- readRDS(state.some.enrich.file)
}
if(is.interactive){quartz()}
plot.enrichment.group(some.enrich, max.term.size = 2000, n.terms = 30,
plot.label = paste("Enrichment of genes with some intragenic state", state))
if(is.interactive){quartz()}
plot.enrichment.group(no.enrich, max.term.size = 2000, n.terms = 30,
plot.label = paste("Enrichment of genes with no intragenic state", state))
some.strains <- which(apply(intragenic.prop, 1, function(x) length(which(x == 0)) > 0 && length(which(x > 0)) > 0))
enrich <- gost(rownames(intragenic.prop)[some.strains], organism = "mmusculus",
sources = c("GO", "KEGG", "REACTOME"))
if(is.interactive){quartz()}
par(mfrow = c(1,2))
plot.enrichment.wordcloud(enrich, num.terms = 30)
mtext(paste("Enrichment of genes with state", state, "in some strains and not others"),
side = 3, outer = TRUE, line = -2.5)
cat("\n\n")
}